{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "8a7d35bf",
   "metadata": {},
   "source": [
    "# Blended Queries in Elasticsearch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "cfd263bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import tika\n",
    "tika.initVM()\n",
    "from tika import parser  \n",
    "import re\n",
    "from datetime import date\n",
    "import pandas as pd\n",
    "import json\n",
    "from datetime import datetime\n",
    "import requests\n",
    "import PyPDF2\n",
    "\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f402137e",
   "metadata": {},
   "outputs": [],
   "source": [
    "ES_URL ='Add your endpoint here'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "1000ba8f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from elasticsearch import Elasticsearch\n",
    "\n",
    "es = Elasticsearch(\n",
    "    'Add your Elastic Search endpoint here',\n",
    "     headers={'Authorization': 'apikey  Add your API key here'},\n",
    "    \n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "66f19035",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'name': 'instance-0000000001', 'cluster_name': '8f0e4b50fdb54cbdbaa9768dce87cce0', 'cluster_uuid': 'KNkOkY30RoGT2iZ-ewHtPA', 'version': {'number': '8.8.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'f8edfccba429b6477927a7c1ce1bc6729521305e', 'build_date': '2023-06-05T21:32:25.188464208Z', 'build_snapshot': False, 'lucene_version': '9.6.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n"
     ]
    }
   ],
   "source": [
    "info = es.info()\n",
    "print(info)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 189,
   "id": "1a7235aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Create a pipline for 144 questions\n",
    "\n",
    "#!pip3 install openpyxl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d3091274",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                              question  Category   \\\n",
      "0    How do I get ready for using Sourcing and Proc...  Indirect    \n",
      "1    What features does SAP Sourcing and Procuremen...     Direct   \n",
      "2    What integration scenarios are available for S...     Direct   \n",
      "3    Can I connect my SAP S/4HANA Cloud with extern...  Indirect    \n",
      "4                    What does Central Procurement do?     Direct   \n",
      "..                                                 ...        ...   \n",
      "139     How can I claim my travel expenses in Concur?     Deviant   \n",
      "140                How can I claim my travel expenses?    Deviant   \n",
      "141                How can I place an order in Ariba?     Deviant   \n",
      "142                 What is our USP against Microsoft?    Deviant   \n",
      "143               When is the birthday of my manager?     Deviant   \n",
      "\n",
      "                                                answer  \\\n",
      "0    I do not have information regarding Sourcing a...   \n",
      "1    I do not have information regarding SAP Sourci...   \n",
      "2    Integration of SAP S/4HANA Procurement Cloud w...   \n",
      "3    Yes, you can. The integration of the procureme...   \n",
      "4    I do not have information regarding Central Pr...   \n",
      "..                                                 ...   \n",
      "139  I do not have information regarding travel exp...   \n",
      "140  I do not have information regarding travel exp...   \n",
      "141  I do not have information regarding how to pla...   \n",
      "142  I do not have information regarding USP agains...   \n",
      "143  I do not have information regarding the birthd...   \n",
      "\n",
      "                                         golden_anwser  \\\n",
      "0      Activate the Scope Items in SAP Best Practices.   \n",
      "1    Pricing, Manage Teams and Responsibilities - P...   \n",
      "2    \\nIntegration of SAP S/4HANA Procurement Cloud...   \n",
      "3    Yes. Integration of the procurement functional...   \n",
      "4    Integrate your SAP S/4HANA Cloud system with o...   \n",
      "..                                                 ...   \n",
      "139                                    I do not know.    \n",
      "140                                    I do not know.    \n",
      "141                                    I do not know.    \n",
      "142                                    I do not know.    \n",
      "143                                    I do not know.    \n",
      "\n",
      "                                                PROMPT  wrong_urls  \\\n",
      "0    Sourcing and Procurement Where can I find more...       False   \n",
      "1    Manufacturing Manufacturing in SAP S/4HANA Clo...        True   \n",
      "2    Integration and APIs Integration in Sourcing a...       False   \n",
      "3    Integration of Procurement with External Suppl...       False   \n",
      "4    Restart failed jobs  Dismiss failed jobs from ...       False   \n",
      "..                                                 ...         ...   \n",
      "139  How predictive accounting works for travel exp...       False   \n",
      "140  Travel Expenses Scenarios In travel expenses s...       False   \n",
      "141  Details for Order Confirmations (J82) On the A...       False   \n",
      "142  Offline Mode Download purchase contract inform...       False   \n",
      "143  Manager Assignment Read about how to assign a ...       False   \n",
      "\n",
      "     wrong_passages    evaluation  \\\n",
      "0             False     incorrect   \n",
      "1              True     incorrect   \n",
      "2             False  insufficient   \n",
      "3             False       correct   \n",
      "4             False     incorrect   \n",
      "..              ...           ...   \n",
      "139           False       correct   \n",
      "140           False       correct   \n",
      "141           False       correct   \n",
      "142           False       correct   \n",
      "143           False       correct   \n",
      "\n",
      "                                              comments  passage_1  ...  \\\n",
      "0                                                  NaN        NaN  ...   \n",
      "1    Focus on India rather than on Sourcing and Pro...        NaN  ...   \n",
      "2                                                  NaN        NaN  ...   \n",
      "3                                                  NaN        NaN  ...   \n",
      "4                                                  NaN        NaN  ...   \n",
      "..                                                 ...        ...  ...   \n",
      "139                                                NaN        NaN  ...   \n",
      "140                                                NaN        NaN  ...   \n",
      "141                                                NaN        NaN  ...   \n",
      "142                                                NaN        NaN  ...   \n",
      "143                                                NaN        NaN  ...   \n",
      "\n",
      "                                    reranker_passage_2  \\\n",
      "0    Sourcing and Procurement. Sourcing and Procure...   \n",
      "1    Sourcing and Procurement. Sourcing and Procure...   \n",
      "2    Adapt and Extend Functionality of Apps (Extens...   \n",
      "3    Integration of Purchase Scheduling Agreement w...   \n",
      "4    All About Central Procurement. Central Procure...   \n",
      "..                                                 ...   \n",
      "139  Videos for Predictive Accounting. Open this vi...   \n",
      "140  Example: Predictive Accounting Based on a Trav...   \n",
      "141  Predecessor: Automated Purchase-to-Pay with SA...   \n",
      "142  Export the Summary of Test Processes. The summ...   \n",
      "143  My Store Overview. My Store Overview Store man...   \n",
      "\n",
      "                                 reranker_passage_2_id  \\\n",
      "0    https://help.sap.com/docs/SAP_S4HANA_CLOUD/634...   \n",
      "1    https://help.sap.com/docs/SAP_S4HANA_CLOUD/634...   \n",
      "2    https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
      "3    https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
      "4    https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
      "..                                                 ...   \n",
      "139  https://help.sap.com/docs/SAP_S4HANA_CLOUD/6b3...   \n",
      "140  https://help.sap.com/docs/SAP_S4HANA_CLOUD/bd3...   \n",
      "141  https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
      "142  https://help.sap.com/docs/SAP_S4HANA_CLOUD/1e9...   \n",
      "143  https://help.sap.com/docs/SAP_S4HANA_CLOUD/646...   \n",
      "\n",
      "                                    reranker_passage_3  \\\n",
      "0    Adapt and Extend Functionality of Apps (Extens...   \n",
      "1    Data Protection in Sourcing and Procurement. D...   \n",
      "2    All About Sourcing. If you have any questions ...   \n",
      "3    FAQ for Sales Order Management and Processing....   \n",
      "4    Central Sourcing. Note If Central Procurement ...   \n",
      "..                                                 ...   \n",
      "139  Integration Between Travel and Expense Managem...   \n",
      "140  Predictive Accounting in Travel and Expense Ma...   \n",
      "141  Purchase Order Collaboration. Purchase Order C...   \n",
      "142  Integration with Microsoft Teams. Integration ...   \n",
      "143  Person Work Agreement Manager for a Given Date...   \n",
      "\n",
      "                                 reranker_passage_3_id  \\\n",
      "0    https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
      "1    https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
      "2    https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
      "3    https://help.sap.com/docs/SAP_S4HANA_CLOUD/03c...   \n",
      "4    https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
      "..                                                 ...   \n",
      "139  https://help.sap.com/docs/SAP_S4HANA_CLOUD/6b3...   \n",
      "140  https://help.sap.com/docs/SAP_S4HANA_CLOUD/bd3...   \n",
      "141  https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
      "142  https://help.sap.com/docs/SAP_S4HANA_CLOUD/ff0...   \n",
      "143  https://help.sap.com/docs/SAP_S4HANA_CLOUD/0f6...   \n",
      "\n",
      "                              elastic_search_passage_1  \\\n",
      "0    Sourcing and Procurement Where can I find more...   \n",
      "1    Ii is uniquely identified together with the pr...   \n",
      "2    Central Procurement With Central Procurement, ...   \n",
      "3    External Sales and Use Tax Communication Scena...   \n",
      "4    More Information  Central Requisitioning  Cent...   \n",
      "..                                                 ...   \n",
      "139  Line item Amount G/L account 337G F9492AEA5DBA...   \n",
      "140  Line item Amount G/L account 337G F9492AEA5DBA...   \n",
      "141  Discount Management (42K) Payment ProposalSAP ...   \n",
      "142  Integration with SAP Analysis for Microsoft Of...   \n",
      "143  Date Choose this field type if you want to, fo...   \n",
      "\n",
      "                           elastic_search_passage_1_id  \\\n",
      "0    https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
      "1    https://help.sap.com/docs/SAP_S4HANA_CLOUD/bb9...   \n",
      "2    https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
      "3    https://help.sap.com/docs/SAP_S4HANA_CLOUD/fd8...   \n",
      "4    https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
      "..                                                 ...   \n",
      "139  https://help.sap.com/docs/SAP_S4HANA_CLOUD/bd3...   \n",
      "140  https://help.sap.com/docs/SAP_S4HANA_CLOUD/bd3...   \n",
      "141  https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
      "142  https://help.sap.com/docs/SAP_S4HANA_CLOUD/90c...   \n",
      "143  https://help.sap.com/docs/SAP_S4HANA_CLOUD/0f6...   \n",
      "\n",
      "                              elastic_search_passage_2  \\\n",
      "0    SAP S/4HANA Cloud Connected with SAP Ariba App...   \n",
      "1    Note If Central Procurement with Ariba Sourcin...   \n",
      "2    SAP S/4HANA Cloud Connected with SAP Ariba App...   \n",
      "3    You can use an A2X service to connect SAP S/4H...   \n",
      "4    Contract Before Saving (MM_PUR_S4_CCTR_CHECK) ...   \n",
      "..                                                 ...   \n",
      "139  Line item Amount G/L account Cost center Profi...   \n",
      "140  Line item Amount G/L account Cost center Profi...   \n",
      "141  If quick enablement is enabled in Ariba Networ...   \n",
      "142  Configuration Settings The following configura...   \n",
      "143  Legal Counsel (Role) Accountable for the lifec...   \n",
      "\n",
      "                           elastic_search_passage_2_id  \\\n",
      "0    https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
      "1    https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
      "2    https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
      "3    https://help.sap.com/docs/SAP_S4HANA_CLOUD/03c...   \n",
      "4    https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
      "..                                                 ...   \n",
      "139  https://help.sap.com/docs/SAP_S4HANA_CLOUD/bd3...   \n",
      "140  https://help.sap.com/docs/SAP_S4HANA_CLOUD/bd3...   \n",
      "141  https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
      "142  https://help.sap.com/docs/SAP_S4HANA_CLOUD/ff0...   \n",
      "143  https://help.sap.com/docs/SAP_S4HANA_CLOUD/ff0...   \n",
      "\n",
      "                              elastic_search_passage_3  \\\n",
      "0    Central Procurement with SAP Ariba Sourcing (4...   \n",
      "1    Data Protection in Sourcing and Procurement Da...   \n",
      "2    Central Procurement with SAP Ariba Sourcing (4...   \n",
      "3    SAP S/4HANA Field Logistics  SAP Fieldglass  S...   \n",
      "4    Procurement: Purchasing Document Procurement: ...   \n",
      "..                                                 ...   \n",
      "139  Example: Predictive Accounting with an Itemize...   \n",
      "140  for the initial predictive journal entry and f...   \n",
      "141  Purchase Order - Create, Write, Delete (Purcha...   \n",
      "142  Exporting Your Data  Context You can export ev...   \n",
      "143  Staffed hours Resource managers can check the ...   \n",
      "\n",
      "                           elastic_search_passage_3_id  \n",
      "0    https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...  \n",
      "1    https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...  \n",
      "2    https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...  \n",
      "3    https://help.sap.com/docs/SAP_S4HANA_CLOUD/74f...  \n",
      "4    https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...  \n",
      "..                                                 ...  \n",
      "139  https://help.sap.com/docs/SAP_S4HANA_CLOUD/bd3...  \n",
      "140  https://help.sap.com/docs/SAP_S4HANA_CLOUD/bd3...  \n",
      "141  https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...  \n",
      "142  https://help.sap.com/docs/SAP_S4HANA_CLOUD/d56...  \n",
      "143  https://help.sap.com/docs/SAP_S4HANA_CLOUD/29c...  \n",
      "\n",
      "[144 rows x 39 columns]\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "# Read Excel file\n",
    "df = pd.read_excel('1686845308-EVALUATION_GOLD_URLS.xlsx')\n",
    "print(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "98c37d91",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['question', 'Category ', 'answer', 'golden_anwser', 'PROMPT',\n",
       "       'wrong_urls', 'wrong_passages', 'evaluation', 'comments', 'passage_1',\n",
       "       'passage_1_id', 'passage_2', 'passage_2_id', 'passage_3',\n",
       "       'passage_3_id', 'discovery_passage_1', 'discovery_passage_1_id',\n",
       "       'discovery_passage_2', 'discovery_passage_2_id', 'discovery_passage_3',\n",
       "       'discovery_passage_3_id', 'proposed-gold-url-1', 'proposed-gold-url-2',\n",
       "       'proposed-gold-url-3', 'gold-url-1', 'gold-url-2', 'gold-url-3',\n",
       "       'reranker_passage_1', 'reranker_passage_1_id', 'reranker_passage_2',\n",
       "       'reranker_passage_2_id', 'reranker_passage_3', 'reranker_passage_3_id',\n",
       "       'elastic_search_passage_1', 'elastic_search_passage_1_id',\n",
       "       'elastic_search_passage_2', 'elastic_search_passage_2_id',\n",
       "       'elastic_search_passage_3', 'elastic_search_passage_3_id'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "76c15b0c",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_sap = df[['question','Category ','proposed-gold-url-1', 'proposed-gold-url-2',\n",
    "       'proposed-gold-url-3', 'gold-url-1', 'gold-url-2', 'gold-url-3']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "1adf53d7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['question', 'Category ', 'proposed-gold-url-1', 'proposed-gold-url-2',\n",
       "       'proposed-gold-url-3', 'gold-url-1', 'gold-url-2', 'gold-url-3'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_sap.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "c520bdb8",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/lp/ydkpkqws5mv6ptd1fy7p5w_40000gn/T/ipykernel_30661/766358501.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df_sap.rename(columns = {'Category ':'Category'}, inplace = True)\n"
     ]
    }
   ],
   "source": [
    "df_sap.rename(columns = {'Category ':'Category'}, inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "57a3adf6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Indirect ', 'Direct', 'Ambiguous ', 'Aggregated', 'Comparative',\n",
       "       'Deviant'], dtype=object)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_sap.Category.unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "c3b1c5c1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/lp/ydkpkqws5mv6ptd1fy7p5w_40000gn/T/ipykernel_30661/1230006049.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df_sap['url1'] =df_sap['gold-url-1']\n",
      "/var/folders/lp/ydkpkqws5mv6ptd1fy7p5w_40000gn/T/ipykernel_30661/1230006049.py:3: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df_sap['url2'] =df_sap['gold-url-2']\n",
      "/var/folders/lp/ydkpkqws5mv6ptd1fy7p5w_40000gn/T/ipykernel_30661/1230006049.py:4: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df_sap['url3'] =df_sap['gold-url-3']\n"
     ]
    }
   ],
   "source": [
    "### Collect the questions based on Category\n",
    "df_sap['url1'] =df_sap['gold-url-1']\n",
    "df_sap['url2'] =df_sap['gold-url-2']\n",
    "df_sap['url3'] =df_sap['gold-url-3']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "3c6fed19",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/lp/ydkpkqws5mv6ptd1fy7p5w_40000gn/T/ipykernel_30661/3898582753.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df_sap['score1'] =df_sap['gold-url-1']\n",
      "/var/folders/lp/ydkpkqws5mv6ptd1fy7p5w_40000gn/T/ipykernel_30661/3898582753.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df_sap['score2'] =df_sap['gold-url-2']\n",
      "/var/folders/lp/ydkpkqws5mv6ptd1fy7p5w_40000gn/T/ipykernel_30661/3898582753.py:3: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df_sap['score3'] =df_sap['gold-url-3']\n"
     ]
    }
   ],
   "source": [
    "df_sap['score1'] =df_sap['gold-url-1']\n",
    "df_sap['score2'] =df_sap['gold-url-2']\n",
    "df_sap['score3'] =df_sap['gold-url-3']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "bbadc22b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['question', 'Category', 'proposed-gold-url-1', 'proposed-gold-url-2',\n",
       "       'proposed-gold-url-3', 'gold-url-1', 'gold-url-2', 'gold-url-3', 'url1',\n",
       "       'url2', 'url3', 'score1', 'score2', 'score3'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_sap.columns"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "367c84a9",
   "metadata": {},
   "source": [
    "## Query Examples\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "e0a7c1b2",
   "metadata": {},
   "source": [
    "### 1. Elser with Boost "
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "af463115",
   "metadata": {},
   "source": [
    "search_query1 = {\n",
    "           \"query\":{\n",
    "              \"text_expansion\":{\n",
    "                 \"ml.tokens\":{\n",
    "                    \"model_id\":\".elser_model_1\",\n",
    "                    \"model_text\":\"\",\n",
    "                     \"boost\":10\n",
    "                 }\n",
    "              }\n",
    "           },\n",
    "          \"min_score\": 10 ,\n",
    "          \"fields\": [\n",
    "            \"filePath\",\n",
    "            \"url\"\n",
    "          ],\n",
    "          \"_source\": False\n",
    "        }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0ccc76f",
   "metadata": {},
   "outputs": [],
   "source": [
    "search_query1 = {\n",
    "           \"query\":{\n",
    "              \"text_expansion\":{\n",
    "                 \"ml.tokens\":{\n",
    "                    \"model_id\":\".elser_model_1\",\n",
    "                    \"model_text\":\"\",\n",
    "                     \"boost\":10\n",
    "                 }\n",
    "              }\n",
    "           },\n",
    "          \"min_score\": 10 ,\n",
    "          \"fields\": [\n",
    "            \"filePath\",\n",
    "            \"url\"\n",
    "          ],\n",
    "          \"_source\": False\n",
    "        }"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "c09e2b86",
   "metadata": {},
   "source": [
    "### 2. Elser without BOOST \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e91a0b2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "search_query1 = {\n",
    "           \"query\":{\n",
    "              \"text_expansion\":{\n",
    "                 \"ml.tokens\":{\n",
    "                    \"model_id\":\".elser_model_1\",\n",
    "                    \"model_text\":\"\"\n",
    "                 }\n",
    "              }\n",
    "           },\n",
    "          \"min_score\": 10 ,\n",
    "          \"fields\": [\n",
    "            \"filePath\",\n",
    "            \"url\"\n",
    "          ],\n",
    "          \"_source\": False\n",
    "        }"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "48286c89",
   "metadata": {},
   "source": [
    "### 2. MULTIMATCH \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a50dd2f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "search_query1 = {\n",
    "  \"query\": {\n",
    "   \"multi_match\" : {\n",
    "      \"query\":]\"\",\n",
    "      \"type\":\"cross_fields\",\n",
    "      \"analyzer\":\"standard\", \n",
    "      \"fields\":[ \"title\", \"text\"]\n",
    "    }\n",
    "  },\n",
    "  \"min_score\": 10 ,\n",
    "  \"fields\": [\n",
    "    \"filePath\",\n",
    "    \"url\"\n",
    "  ],\n",
    "  \"_source\": False\n",
    "  \n",
    "}"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "092f1d8c",
   "metadata": {},
   "source": [
    "### 3. MULTIMATCH with BOOST \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "847094cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "search_query1 = {\n",
    "  \"query\": {\n",
    "   \"multi_match\" : {\n",
    "      \"query\":\"\",\n",
    "      \"type\":\"cross_fields\",\n",
    "      \"analyzer\":\"standard\", \n",
    "      \"fields\":[ \"title\", \"text\"],\n",
    "       \"boost\":10\n",
    "    }\n",
    "  },\n",
    "  \"min_score\": 10 ,\n",
    "  \"fields\": [\n",
    "    \"filePath\",\n",
    "    \"url\"\n",
    "  ],\n",
    "  \"_source\": False\n",
    "  \n",
    "}"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "3e09f222",
   "metadata": {},
   "source": [
    "### 4. without analyser\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f0d179c",
   "metadata": {},
   "outputs": [],
   "source": [
    "search_query1 = {\n",
    "  \"query\": {\n",
    "   \"multi_match\" : {\n",
    "      \"query\":\"\",\n",
    "      \"type\":\"cross_fields\",\n",
    "      \"fields\":[ \"title\", \"text\"],\n",
    "       \"boost\":10\n",
    "    }\n",
    "  },\n",
    "  \"min_score\": 10 ,\n",
    "  \"fields\": [\n",
    "    \"filePath\",\n",
    "    \"url\"\n",
    "  ],\n",
    "  \"_source\": False\n",
    "  \n",
    "}"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "cd247e13",
   "metadata": {},
   "source": [
    "### 5. Best Field\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "148cd2f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "search_query1 = {\n",
    "  \"query\": {\n",
    "   \"multi_match\" : {\n",
    "      \"query\":\"\",\n",
    "      \"type\":\"best_fields\",\n",
    "      \"fields\":[ \"title\", \"text\"],\n",
    "       \"tie_breaker\": 0.3\n",
    "       \n",
    "    }\n",
    "  },\n",
    "  \"min_score\": 10 ,\n",
    "  \"fields\": [\n",
    "    \"filePath\",\n",
    "    \"url\"\n",
    "  ],\n",
    "  \"_source\": False\n",
    "  \n",
    "}"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "91e50849",
   "metadata": {},
   "source": [
    "### 6. bool query\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e16bb8a",
   "metadata": {},
   "outputs": [],
   "source": [
    "search_query1 = {\n",
    "  \"query\": {\n",
    "   \"bool\": {\n",
    "      \"must\": {\n",
    "        \"match\": {      \n",
    "          \"text\": \"How can I park supplier Invoices\"\n",
    "      }\n",
    "    }\n",
    "  \n",
    "    }\n",
    "  },\n",
    "  \"min_score\": 10 ,\n",
    "  \"fields\": [\n",
    "    \"filePath\",\n",
    "    \"url\"\n",
    "  ],\n",
    "  \"_source\": False\n",
    "  \n",
    "}\n",
    "\n",
    "{\n",
    "  \"query\": {\n",
    "    \n",
    "}\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5a5b7e5b",
   "metadata": {},
   "source": [
    "### 7. Elser with Cross field\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5431e2cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "search_query1={\n",
    "  \"query\": {\n",
    "    \"bool\": { \n",
    "      \"should\": [\n",
    "        {\n",
    "          \"text_expansion\": {\n",
    "            \"ml.tokens\": {\n",
    "              \"model_text\": \"Where do I find the APIs that are availbe in Sourcing and Procurment?\",\n",
    "              \"model_id\": \".elser_model_1\"\n",
    "            }\n",
    "          }\n",
    "        }\n",
    "      ],\n",
    "       \"must\": {\n",
    "        \"multi_match\" : {\n",
    "        \"query\":      \"Where do I find the APIs that are availbe in Sourcing and Procurment?\",\n",
    "        \"type\":       \"cross_fields\",\n",
    "        \"analyzer\":   \"standard\", \n",
    "        \"fields\":     [ \"title\", \"text\"]\n",
    "    }\n",
    "    }\n",
    "    }\n",
    "  },\n",
    "  \"min_score\": 10 ,\n",
    "  \"fields\": [\n",
    "    \"filePath\",\n",
    "    \"url\"\n",
    "  ],\n",
    "  \"_source\": False\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f773d77c",
   "metadata": {},
   "source": [
    "### 8. Elser with best field\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "961912dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ELSER text_expansion (optional, should) combined with a required best_fields\n",
    "# multi_match over title and text. `question` must already be defined.\n",
    "search_query1 = {\n",
    "    'query': {\n",
    "        'bool': {\n",
    "            'should': [\n",
    "                {\n",
    "                    'text_expansion': {\n",
    "                        'ml.tokens': {\n",
    "                            'model_text': question,\n",
    "                            'model_id': '.elser_model_1',\n",
    "                        }\n",
    "                    }\n",
    "                }\n",
    "            ],\n",
    "            'must': {\n",
    "                'multi_match': {\n",
    "                    'query': question,\n",
    "                    'type': 'best_fields',\n",
    "                    'fields': ['title', 'text'],\n",
    "                    'tie_breaker': 0.3,\n",
    "                }\n",
    "            },\n",
    "        }\n",
    "    },\n",
    "    'min_score': 10,\n",
    "    'fields': ['filePath', 'url'],\n",
    "    '_source': False,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1afbe962",
   "metadata": {},
   "outputs": [],
   "source": [
    "### phrase_prefix variants\n",
    "\n",
    "# Lexical-only phrase_prefix query. NOTE(review): its query text is empty and the\n",
    "# dict is replaced by the assignment that follows, so it is never sent as-is.\n",
    "search_query1 = {\n",
    "    'query': {\n",
    "        'multi_match': {\n",
    "            'query': '',\n",
    "            'type': 'phrase_prefix',\n",
    "            'fields': ['title', 'text'],\n",
    "        }\n",
    "    },\n",
    "    'min_score': 10,\n",
    "    'fields': ['filePath', 'url'],\n",
    "    '_source': False,\n",
    "}\n",
    "\n",
    "# ELSER text_expansion (optional, should) plus a required phrase_prefix multi_match.\n",
    "# `question` must already be defined by an earlier cell.\n",
    "# This assignment has to start at column 0: a stray leading space in front of a\n",
    "# top-level statement makes the whole cell fail with an IndentationError.\n",
    "search_query1 = {\n",
    "    'query': {\n",
    "        'bool': {\n",
    "            'should': [\n",
    "                {\n",
    "                    'text_expansion': {\n",
    "                        'ml.tokens': {\n",
    "                            'model_text': question,\n",
    "                            'model_id': '.elser_model_1',\n",
    "                        }\n",
    "                    }\n",
    "                }\n",
    "            ],\n",
    "            'must': {\n",
    "                'multi_match': {\n",
    "                    'query': question,\n",
    "                    'type': 'phrase_prefix',\n",
    "                    'fields': ['title', 'text'],\n",
    "                }\n",
    "            },\n",
    "        }\n",
    "    },\n",
    "    'min_score': 10,\n",
    "    'fields': ['filePath', 'url'],\n",
    "    '_source': False,\n",
    "}"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "8a2035be",
   "metadata": {},
   "source": [
    "### 9. `bool_prefix` with ELSER\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ffe7632",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ELSER text_expansion (optional, should) combined with a required bool_prefix\n",
    "# multi_match over title and text. `question` must already be defined.\n",
    "search_query = {\n",
    "    'query': {\n",
    "        'bool': {\n",
    "            'should': [\n",
    "                {\n",
    "                    'text_expansion': {\n",
    "                        'ml.tokens': {\n",
    "                            'model_text': question,\n",
    "                            'model_id': '.elser_model_1',\n",
    "                        }\n",
    "                    }\n",
    "                }\n",
    "            ],\n",
    "            'must': {\n",
    "                'multi_match': {\n",
    "                    'query': question,\n",
    "                    'type': 'bool_prefix',\n",
    "                    'fields': ['title', 'text'],\n",
    "                }\n",
    "            },\n",
    "        }\n",
    "    },\n",
    "    'min_score': 10,\n",
    "    'fields': ['filePath', 'url'],\n",
    "    '_source': False,\n",
    "}"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "5d0740c9",
   "metadata": {},
   "source": [
    "### 10. ELSER with boost fields\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33f7948c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ELSER text_expansion (optional, should) combined with a required most_fields\n",
    "# multi_match over title and text. `question` must already be defined.\n",
    "# The query is defined once and bound to a name so it can be passed to es.search.\n",
    "# NOTE(review): the section title mentions boosting, but no per-field boost is set\n",
    "# in `fields` here - confirm the intended weights before relying on this variant.\n",
    "search_query1 = {\n",
    "    'query': {\n",
    "        'bool': {\n",
    "            'should': [\n",
    "                {\n",
    "                    'text_expansion': {\n",
    "                        'ml.tokens': {\n",
    "                            'model_text': question,\n",
    "                            'model_id': '.elser_model_1',\n",
    "                        }\n",
    "                    }\n",
    "                }\n",
    "            ],\n",
    "            'must': {\n",
    "                'multi_match': {\n",
    "                    'query': question,\n",
    "                    'type': 'most_fields',\n",
    "                    'fields': ['title', 'text'],\n",
    "                }\n",
    "            },\n",
    "        }\n",
    "    },\n",
    "    'min_score': 10,\n",
    "    'fields': ['filePath', 'url'],\n",
    "    '_source': False,\n",
    "}\n",
    "# Keep showing the query as the cell output.\n",
    "search_query1"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "eedbc04f",
   "metadata": {},
   "source": [
    "## Question embedding\n",
    "#### sentence-transformers__all-minilm-l6-v2\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "4241f44a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def processESIndex(df_sap):\n",
    "    i =0\n",
    "    for ind in df_sap.index:\n",
    "        print(\"Processsing -----\",i)\n",
    "        question =df_sap['question'][ind]\n",
    "        search_query1 = {\n",
    "          \"query\": {\n",
    "           \"multi_match\" : {\n",
    "              \"query\":question,\n",
    "              \"type\":\"bool_prefix\",\n",
    "              \"fields\":[ \"title\", \"text\"]\n",
    "\n",
    "            }\n",
    "          },\n",
    "          \"min_score\": 10 ,\n",
    "          \"fields\": [\n",
    "            \"filePath\",\n",
    "            \"url\"\n",
    "          ],\n",
    "          \"_source\": False\n",
    "\n",
    "        }\n",
    "        response = es.search(\n",
    "        index='elser-s4hana-business-39k-512t-256t-2023-07-26',\n",
    "        body=search_query1,\n",
    "        scroll='5m',  # Set the scroll timeout (e.g., 5 minutes)\n",
    "        size=3  # Set the number of documents to retrieve per scroll\n",
    "        )\n",
    "        all_hits = response['hits']['hits']\n",
    "        print(len(all_hits))\n",
    "        j=0\n",
    "        for num, doc in enumerate(all_hits):\n",
    "            print (\"DOC ID:\", doc[\"_id\"], \"--->\", doc, type(doc), \"\\n\")\n",
    "            print (\"DOC Score:\", doc[\"_score\"], \"\\n\")\n",
    "            print (\"DOC url :\", doc[\"fields\"]['url'], \"\\n\")\n",
    "            if j==0:\n",
    "                df_sap['url1'][ind] =doc[\"fields\"]['url']\n",
    "                df_sap['score1'][ind] =doc[\"_score\"]\n",
    "            elif j==1:\n",
    "                df_sap['url2'][ind] =doc[\"fields\"]['url']\n",
    "                df_sap['score2'][ind] =doc[\"_score\"]\n",
    "            elif j==2:\n",
    "                df_sap['url3'][ind] =doc[\"fields\"]['url']\n",
    "                df_sap['score3'][ind] =doc[\"_score\"]\n",
    "            j=j+1\n",
    "        i =i+1\n",
    "    return df_sap\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb1ffa0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fill url1..3 / score1..3 for every question (one Elasticsearch request per row).\n",
    "df_sap = processESIndex(df_sap)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "b66b644c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['question', 'Category', 'proposed-gold-url-1', 'proposed-gold-url-2',\n",
       "       'proposed-gold-url-3', 'gold-url-1', 'gold-url-2', 'gold-url-3', 'url1',\n",
       "       'url2', 'url3', 'score1', 'score2', 'score3'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the columns to confirm the url1..3 / score1..3 result columns are present.\n",
    "df_sap.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "81e47213",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question</th>\n",
       "      <th>Category</th>\n",
       "      <th>proposed-gold-url-1</th>\n",
       "      <th>proposed-gold-url-2</th>\n",
       "      <th>proposed-gold-url-3</th>\n",
       "      <th>gold-url-1</th>\n",
       "      <th>gold-url-2</th>\n",
       "      <th>gold-url-3</th>\n",
       "      <th>url1</th>\n",
       "      <th>url2</th>\n",
       "      <th>url3</th>\n",
       "      <th>score1</th>\n",
       "      <th>score2</th>\n",
       "      <th>score3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>How do I get ready for using Sourcing and Proc...</td>\n",
       "      <td>Indirect</td>\n",
       "      <td>https://help.sap.com/docs/SAP_S4HANA_CLOUD/bb9...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e...</td>\n",
       "      <td>[https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e...</td>\n",
       "      <td>[https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e...</td>\n",
       "      <td>45.986645</td>\n",
       "      <td>44.425552</td>\n",
       "      <td>40.902496</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>What features does SAP Sourcing and Procuremen...</td>\n",
       "      <td>Direct</td>\n",
       "      <td>https://help.sap.com/docs/SAP_S4HANA_CLOUD/bb9...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[https://help.sap.com/docs/SAP_S4HANA_CLOUD/63...</td>\n",
       "      <td>[https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e...</td>\n",
       "      <td>[https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e...</td>\n",
       "      <td>28.099304</td>\n",
       "      <td>27.13894</td>\n",
       "      <td>26.921707</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>What integration scenarios are available for S...</td>\n",
       "      <td>Direct</td>\n",
       "      <td>https://help.sap.com/docs/SAP_S4HANA_CLOUD/bb9...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e...</td>\n",
       "      <td>[https://help.sap.com/docs/SAP_S4HANA_CLOUD/b2...</td>\n",
       "      <td>[https://help.sap.com/docs/SAP_S4HANA_CLOUD/b2...</td>\n",
       "      <td>50.218422</td>\n",
       "      <td>48.836723</td>\n",
       "      <td>48.836723</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Can I connect my SAP S/4HANA Cloud with extern...</td>\n",
       "      <td>Indirect</td>\n",
       "      <td>https://help.sap.com/docs/SAP_S4HANA_CLOUD/bb9...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[https://help.sap.com/docs/SAP_S4HANA_CLOUD/6b...</td>\n",
       "      <td>[https://help.sap.com/docs/SAP_S4HANA_CLOUD/6b...</td>\n",
       "      <td>[https://help.sap.com/docs/SAP_S4HANA_CLOUD/6b...</td>\n",
       "      <td>53.618404</td>\n",
       "      <td>50.663227</td>\n",
       "      <td>49.49715</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>What does Central Procurement do?</td>\n",
       "      <td>Direct</td>\n",
       "      <td>https://help.sap.com/docs/SAP_S4HANA_CLOUD/bb9...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e...</td>\n",
       "      <td>[https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e...</td>\n",
       "      <td>[https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e...</td>\n",
       "      <td>25.9769</td>\n",
       "      <td>22.494854</td>\n",
       "      <td>22.246145</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            question   Category  \\\n",
       "0  How do I get ready for using Sourcing and Proc...  Indirect    \n",
       "1  What features does SAP Sourcing and Procuremen...     Direct   \n",
       "2  What integration scenarios are available for S...     Direct   \n",
       "3  Can I connect my SAP S/4HANA Cloud with extern...  Indirect    \n",
       "4                  What does Central Procurement do?     Direct   \n",
       "\n",
       "                                 proposed-gold-url-1 proposed-gold-url-2  \\\n",
       "0  https://help.sap.com/docs/SAP_S4HANA_CLOUD/bb9...                 NaN   \n",
       "1  https://help.sap.com/docs/SAP_S4HANA_CLOUD/bb9...                 NaN   \n",
       "2  https://help.sap.com/docs/SAP_S4HANA_CLOUD/bb9...                 NaN   \n",
       "3  https://help.sap.com/docs/SAP_S4HANA_CLOUD/bb9...                 NaN   \n",
       "4  https://help.sap.com/docs/SAP_S4HANA_CLOUD/bb9...                 NaN   \n",
       "\n",
       "  proposed-gold-url-3                                         gold-url-1  \\\n",
       "0                 NaN  https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
       "1                 NaN  https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
       "2                 NaN  https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
       "3                 NaN  https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
       "4                 NaN  https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e6...   \n",
       "\n",
       "  gold-url-2 gold-url-3                                               url1  \\\n",
       "0        NaN        NaN  [https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e...   \n",
       "1        NaN        NaN  [https://help.sap.com/docs/SAP_S4HANA_CLOUD/63...   \n",
       "2        NaN        NaN  [https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e...   \n",
       "3        NaN        NaN  [https://help.sap.com/docs/SAP_S4HANA_CLOUD/6b...   \n",
       "4        NaN        NaN  [https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e...   \n",
       "\n",
       "                                                url2  \\\n",
       "0  [https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e...   \n",
       "1  [https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e...   \n",
       "2  [https://help.sap.com/docs/SAP_S4HANA_CLOUD/b2...   \n",
       "3  [https://help.sap.com/docs/SAP_S4HANA_CLOUD/6b...   \n",
       "4  [https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e...   \n",
       "\n",
       "                                                url3     score1     score2  \\\n",
       "0  [https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e...  45.986645  44.425552   \n",
       "1  [https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e...  28.099304   27.13894   \n",
       "2  [https://help.sap.com/docs/SAP_S4HANA_CLOUD/b2...  50.218422  48.836723   \n",
       "3  [https://help.sap.com/docs/SAP_S4HANA_CLOUD/6b...  53.618404  50.663227   \n",
       "4  [https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e...    25.9769  22.494854   \n",
       "\n",
       "      score3  \n",
       "0  40.902496  \n",
       "1  26.921707  \n",
       "2  48.836723  \n",
       "3   49.49715  \n",
       "4  22.246145  "
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows, including the retrieved URLs and their scores.\n",
    "df_sap.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "58859907",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the frame, including the retrieved URLs and scores, to an Excel workbook\n",
    "# (relative path, so it lands in the current working directory).\n",
    "df_sap.to_excel('SAPBool_prefixwithoutBoost.xlsx')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "9894ad74",
   "metadata": {},
   "source": [
    "### Execute Queries over the entire dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "id": "a8dde2ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Walk the evaluation set row by row. No query is issued in this cell: when the\n",
    "# loop ends, `question` and `cat` simply hold the values of the last row.\n",
    "for ind in df_sap.index:\n",
    "    question = df_sap.loc[ind, 'question']\n",
    "    cat = df_sap.loc[ind, 'Category']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "id": "57f49f4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "search_query1 = {\n",
    "   \"query\":{\n",
    "      \"text_expansion\":{\n",
    "         \"ml.tokens\":{\n",
    "            \"model_id\":\".elser_model_1\",\n",
    "            \"model_text\":\"Where do I work on purchase contracts contracts?\",\n",
    "            \"boost\":10\n",
    "         }\n",
    "      }\n",
    "   },\n",
    "  \"min_score\": 10 ,\n",
    "  \"fields\": [\n",
    "    \"filePath\",\n",
    "    \"url\"\n",
    "  ],\n",
    "  \"_source\": False\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "id": "7d4d5058",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10\n"
     ]
    }
   ],
   "source": [
    "# Fetch only the three best hits for the query defined above.\n",
    "# No scroll is requested: a scroll context is meant for paging through large\n",
    "# result sets and stays open on the cluster until it expires or is cleared,\n",
    "# which is wasted here because only the first page is ever read.\n",
    "response = es.search(\n",
    "    index='elser-s4hana-business-39k-512t-256t-2023-07-26',\n",
    "    body=search_query1,\n",
    "    size=3,  # number of hits to return\n",
    ")\n",
    "all_hits = response['hits']['hits']\n",
    "print(len(all_hits))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "id": "76e441dd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DOC ID: hana_05638.txt-0-17 ---> {'_index': 'elser-s4hana-business-39k-512t-256t-2023-07-26', '_id': 'hana_05638.txt-0-17', '_score': 158.58463, 'fields': {'url': ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/bb9f1469daf04bd894ab2167f8132a1a/400b07d741274f4893372189181d1480.html'], 'filePath': ['400b07d741274f4893372189181d1480.html']}} <class 'dict'> \n",
      "\n",
      "DOC Score: 158.58463 \n",
      "\n",
      "DOC url : ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/bb9f1469daf04bd894ab2167f8132a1a/400b07d741274f4893372189181d1480.html'] \n",
      "\n",
      "\n",
      "\n",
      "\n",
      "DOC ID: hana_29224.txt-16429-18681 ---> {'_index': 'elser-s4hana-business-39k-512t-256t-2023-07-26', '_id': 'hana_29224.txt-16429-18681', '_score': 151.82756, 'fields': {'url': ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e602d466b99490187fcbb30d1dc897c/a8bbfa839c394505a7557f8171637d1a.html'], 'filePath': ['a8bbfa839c394505a7557f8171637d1a.html']}} <class 'dict'> \n",
      "\n",
      "DOC Score: 151.82756 \n",
      "\n",
      "DOC url : ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e602d466b99490187fcbb30d1dc897c/a8bbfa839c394505a7557f8171637d1a.html'] \n",
      "\n",
      "\n",
      "\n",
      "\n",
      "DOC ID: hana_29278.txt-19063-20135 ---> {'_index': 'elser-s4hana-business-39k-512t-256t-2023-07-26', '_id': 'hana_29278.txt-19063-20135', '_score': 150.44446, 'fields': {'url': ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e602d466b99490187fcbb30d1dc897c/0b59376b880845b8a190bd5a339eed14.html'], 'filePath': ['0b59376b880845b8a190bd5a339eed14.html']}} <class 'dict'> \n",
      "\n",
      "DOC Score: 150.44446 \n",
      "\n",
      "DOC url : ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e602d466b99490187fcbb30d1dc897c/0b59376b880845b8a190bd5a339eed14.html'] \n",
      "\n",
      "\n",
      "\n",
      "\n",
      "DOC ID: hana_29224.txt-15364-17607 ---> {'_index': 'elser-s4hana-business-39k-512t-256t-2023-07-26', '_id': 'hana_29224.txt-15364-17607', '_score': 149.83897, 'fields': {'url': ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e602d466b99490187fcbb30d1dc897c/a8bbfa839c394505a7557f8171637d1a.html'], 'filePath': ['a8bbfa839c394505a7557f8171637d1a.html']}} <class 'dict'> \n",
      "\n",
      "DOC Score: 149.83897 \n",
      "\n",
      "DOC url : ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e602d466b99490187fcbb30d1dc897c/a8bbfa839c394505a7557f8171637d1a.html'] \n",
      "\n",
      "\n",
      "\n",
      "\n",
      "DOC ID: hana_04997.txt-2054-4041 ---> {'_index': 'elser-s4hana-business-39k-512t-256t-2023-07-26', '_id': 'hana_04997.txt-2054-4041', '_score': 149.09933, 'fields': {'url': ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e602d466b99490187fcbb30d1dc897c/a86a51fe2f1b423cac02a0a70d133c9e.html'], 'filePath': ['a86a51fe2f1b423cac02a0a70d133c9e.html']}} <class 'dict'> \n",
      "\n",
      "DOC Score: 149.09933 \n",
      "\n",
      "DOC url : ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e602d466b99490187fcbb30d1dc897c/a86a51fe2f1b423cac02a0a70d133c9e.html'] \n",
      "\n",
      "\n",
      "\n",
      "\n",
      "DOC ID: hana_29224.txt-9733-11908 ---> {'_index': 'elser-s4hana-business-39k-512t-256t-2023-07-26', '_id': 'hana_29224.txt-9733-11908', '_score': 147.61778, 'fields': {'url': ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e602d466b99490187fcbb30d1dc897c/a8bbfa839c394505a7557f8171637d1a.html'], 'filePath': ['a8bbfa839c394505a7557f8171637d1a.html']}} <class 'dict'> \n",
      "\n",
      "DOC Score: 147.61778 \n",
      "\n",
      "DOC url : ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e602d466b99490187fcbb30d1dc897c/a8bbfa839c394505a7557f8171637d1a.html'] \n",
      "\n",
      "\n",
      "\n",
      "\n",
      "DOC ID: hana_29278.txt-11211-13619 ---> {'_index': 'elser-s4hana-business-39k-512t-256t-2023-07-26', '_id': 'hana_29278.txt-11211-13619', '_score': 146.75902, 'fields': {'url': ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e602d466b99490187fcbb30d1dc897c/0b59376b880845b8a190bd5a339eed14.html'], 'filePath': ['0b59376b880845b8a190bd5a339eed14.html']}} <class 'dict'> \n",
      "\n",
      "DOC Score: 146.75902 \n",
      "\n",
      "DOC url : ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e602d466b99490187fcbb30d1dc897c/0b59376b880845b8a190bd5a339eed14.html'] \n",
      "\n",
      "\n",
      "\n",
      "\n",
      "DOC ID: hana_29224.txt-9039-10933 ---> {'_index': 'elser-s4hana-business-39k-512t-256t-2023-07-26', '_id': 'hana_29224.txt-9039-10933', '_score': 146.34702, 'fields': {'url': ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e602d466b99490187fcbb30d1dc897c/a8bbfa839c394505a7557f8171637d1a.html'], 'filePath': ['a8bbfa839c394505a7557f8171637d1a.html']}} <class 'dict'> \n",
      "\n",
      "DOC Score: 146.34702 \n",
      "\n",
      "DOC url : ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e602d466b99490187fcbb30d1dc897c/a8bbfa839c394505a7557f8171637d1a.html'] \n",
      "\n",
      "\n",
      "\n",
      "\n",
      "DOC ID: hana_29224.txt-14469-16516 ---> {'_index': 'elser-s4hana-business-39k-512t-256t-2023-07-26', '_id': 'hana_29224.txt-14469-16516', '_score': 145.71216, 'fields': {'url': ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e602d466b99490187fcbb30d1dc897c/a8bbfa839c394505a7557f8171637d1a.html'], 'filePath': ['a8bbfa839c394505a7557f8171637d1a.html']}} <class 'dict'> \n",
      "\n",
      "DOC Score: 145.71216 \n",
      "\n",
      "DOC url : ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e602d466b99490187fcbb30d1dc897c/a8bbfa839c394505a7557f8171637d1a.html'] \n",
      "\n",
      "\n",
      "\n",
      "\n",
      "DOC ID: hana_29278.txt-14268-16388 ---> {'_index': 'elser-s4hana-business-39k-512t-256t-2023-07-26', '_id': 'hana_29278.txt-14268-16388', '_score': 145.62083, 'fields': {'url': ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e602d466b99490187fcbb30d1dc897c/0b59376b880845b8a190bd5a339eed14.html'], 'filePath': ['0b59376b880845b8a190bd5a339eed14.html']}} <class 'dict'> \n",
      "\n",
      "DOC Score: 145.62083 \n",
      "\n",
      "DOC url : ['https://help.sap.com/docs/SAP_S4HANA_CLOUD/0e602d466b99490187fcbb30d1dc897c/0b59376b880845b8a190bd5a339eed14.html'] \n",
      "\n",
      "\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print the id, score and url of every hit in the response['hits']['hits'] list.\n",
    "for doc in all_hits:\n",
    "    # A hit that lacks the requested fields carries no 'fields' / 'url' key,\n",
    "    # so read it defensively instead of indexing directly.\n",
    "    url = doc.get('fields', {}).get('url')\n",
    "    print('DOC ID:', doc['_id'], '--->', doc, type(doc), '\\n')\n",
    "    print('DOC Score:', doc['_score'], '\\n')\n",
    "    print('DOC url :', url, '\\n')\n",
    "\n",
    "    # print a few blank lines between each doc for readability\n",
    "    print('\\n\\n\\n')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
